The main purpose of our project is to predict 6 weeks of daily sales of Rossmann for 1,115 stores located across Germany on Kaggle competition.

1. PREPROCESSING AND PARTITIONING

The dataset consists of roughly 10^6 rows x 13 columns. For the purpose of the project we added the historical dataset “Store” and merged the two datasets by the variable “Store”. In the next step, such variables as month, year and week were created. Finally, we got the main dataset with 384895 observations (rows) and 28 variables (columns).

# Install (if necessary) and attach all required packages.
# FIXES vs original:
#  - install.packages() has no `character.only` argument (it was silently
#    swallowed);
#  - after installing, library(i) was called WITHOUT character.only = TRUE,
#    which tries to attach a package literally named "i" and errors;
#  - T replaced by TRUE (T is a reassignable binding, not a keyword).
packages <- c('lmtest','forecast','xts','fBasics','urca','lubridate','tidyr',
              'dplyr','readr','data.table','ggplot2','zoo')
for (pkg in packages) {
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg)
  }
  suppressWarnings(suppressMessages(library(pkg, character.only = TRUE)))
}
# Load the daily sales data and the per-store metadata, then join them on
# the common "Store" key and derive calendar features from the Date column.
rossmann <- data.table::fread("all/train.csv")
store    <- data.table::fread("all/store.csv")
rossmann <- merge(rossmann, store, by = "Store")

rossmann$Date  <- as.Date(rossmann$Date, "%Y-%m-%d")
rossmann$Month <- month(rossmann$Date)
rossmann$Year  <- year(rossmann$Date)

# Keep an untouched copy for the ARIMA analysis later on.
rossmann1 <- rossmann

# Order rows by store and date so window/lag operations below are valid.
rossmann <- arrange(rossmann, Store, Date)
## Warning: package 'bindrcpp' was built under R version 3.4.4

Descriptions of variables:

Sales: the turnover for any given day (this is what you are predicting) Customers: the number of customers on a given day Open: an indicator for whether the store was open: 0 = closed, 1 = open StateHoliday: indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays. Note that all schools are closed on public holidays and weekends. a = public holiday, b = Easter holiday, c = Christmas, 0 = None SchoolHoliday: indicates if the (Store, Date) was affected by the closure of public schools StoreType: differentiates between 4 different store models: a, b, c, d Promo: indicates whether a store is running a promo on that day DayOfWeek Date Assortment: describes an assortment level: a = basic, b = extra, c = extended CompetitionDistance: distance in meters to the nearest competitor store CompetitionOpenSince: gives the approximate year and month of the time the nearest competitor was opened Promo2 - Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating PromoInterval: describes the consecutive intervals Promo2 is started, naming the months the promotion is started anew. E.g. “Feb,May,Aug,Nov” means each round starts in February, May, August, November of any given year for that store.

# Coerce columns to the types needed for modelling: counts and indicator
# columns to numeric, categorical codes to factors.
to_numeric <- c("Store", "DayOfWeek", "Sales", "Customers", "Open", "Promo",
                "SchoolHoliday", "CompetitionDistance",
                "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear",
                "Promo2", "Promo2SinceWeek", "Promo2SinceYear",
                "Month", "Year")
to_factor <- c("StateHoliday", "StoreType", "Assortment", "PromoInterval")

for (col in to_numeric) rossmann[[col]] <- as.numeric(rossmann[[col]])
for (col in to_factor)  rossmann[[col]] <- as.factor(rossmann[[col]])

summary(rossmann)
##      Store          DayOfWeek          Date                Sales      
##  Min.   :   1.0   Min.   :1.000   Min.   :2013-01-01   Min.   :    0  
##  1st Qu.: 280.0   1st Qu.:2.000   1st Qu.:2013-08-17   1st Qu.: 3727  
##  Median : 558.0   Median :4.000   Median :2014-04-02   Median : 5744  
##  Mean   : 558.4   Mean   :3.998   Mean   :2014-04-11   Mean   : 5774  
##  3rd Qu.: 838.0   3rd Qu.:6.000   3rd Qu.:2014-12-12   3rd Qu.: 7856  
##  Max.   :1115.0   Max.   :7.000   Max.   :2015-07-31   Max.   :41551  
##                                                                       
##    Customers           Open            Promo        StateHoliday
##  Min.   :   0.0   Min.   :0.0000   Min.   :0.0000   0:986159    
##  1st Qu.: 405.0   1st Qu.:1.0000   1st Qu.:0.0000   a: 20260    
##  Median : 609.0   Median :1.0000   Median :0.0000   b:  6690    
##  Mean   : 633.1   Mean   :0.8301   Mean   :0.3815   c:  4100    
##  3rd Qu.: 837.0   3rd Qu.:1.0000   3rd Qu.:1.0000               
##  Max.   :7388.0   Max.   :1.0000   Max.   :1.0000               
##                                                                 
##  SchoolHoliday    StoreType  Assortment CompetitionDistance
##  Min.   :0.0000   a:551627   a:537445   Min.   :   20      
##  1st Qu.:0.0000   b: 15830   b:  8294   1st Qu.:  710      
##  Median :0.0000   c:136840   c:471470   Median : 2330      
##  Mean   :0.1786   d:312912              Mean   : 5430      
##  3rd Qu.:0.0000                         3rd Qu.: 6890      
##  Max.   :1.0000                         Max.   :75860      
##                                         NA's   :2642       
##  CompetitionOpenSinceMonth CompetitionOpenSinceYear     Promo2      
##  Min.   : 1.0              Min.   :1900             Min.   :0.0000  
##  1st Qu.: 4.0              1st Qu.:2006             1st Qu.:0.0000  
##  Median : 8.0              Median :2010             Median :1.0000  
##  Mean   : 7.2              Mean   :2009             Mean   :0.5006  
##  3rd Qu.:10.0              3rd Qu.:2013             3rd Qu.:1.0000  
##  Max.   :12.0              Max.   :2015             Max.   :1.0000  
##  NA's   :323348            NA's   :323348                           
##  Promo2SinceWeek  Promo2SinceYear           PromoInterval   
##  Min.   : 1.0     Min.   :2009                     :508031  
##  1st Qu.:13.0     1st Qu.:2011     Feb,May,Aug,Nov :118596  
##  Median :22.0     Median :2012     Jan,Apr,Jul,Oct :293122  
##  Mean   :23.3     Mean   :2012     Mar,Jun,Sept,Dec: 97460  
##  3rd Qu.:37.0     3rd Qu.:2013                              
##  Max.   :50.0     Max.   :2015                              
##  NA's   :508031   NA's   :508031                            
##      Month             Year     
##  Min.   : 1.000   Min.   :2013  
##  1st Qu.: 3.000   1st Qu.:2013  
##  Median : 6.000   Median :2014  
##  Mean   : 5.847   Mean   :2014  
##  3rd Qu.: 8.000   3rd Qu.:2014  
##  Max.   :12.000   Max.   :2015  
## 

1.1 Data missing issue

# Per-store observation coverage: how many days does each store have, and
# over which date range? Grouping stores by their (start, end, day-count)
# profile exposes gaps in the history.
store_coverage <- rossmann %>%
  group_by(Store) %>%
  summarise(days = n(),
            start_date = min(Date),
            end_date = max(Date))

store_coverage %>%
  group_by(start_date, end_date, days) %>%
  summarise(no_store = n())
## # A tibble: 3 x 4
## # Groups:   start_date, end_date [?]
##   start_date end_date    days no_store
##   <date>     <date>     <int>    <int>
## 1 2013-01-01 2015-07-31   758      180
## 2 2013-01-01 2015-07-31   942      934
## 3 2013-01-02 2015-07-31   941        1

Interestingly, there are two groups of stores with significantly different numbers of observations (758 vs 942 observations). So, we decided to investigate further!

180 stores do not have the data during 6 months end of 2014.

Let’s investigate Sunday sales.

# Count stores that ever record Sunday turnover: sum Sales on Sundays
# (DayOfWeek == 7) per store and flag stores with a positive total.
rossmann %>%
  filter(DayOfWeek == 7) %>%
  group_by(Store) %>%
  summarise(Sales = sum(Sales)) %>%
  ungroup() %>%
  mutate(Sunday_sales = as.numeric(Sales > 0)) %>%
  group_by(Sunday_sales) %>%
  summarise(Store_count = n())
## # A tibble: 2 x 2
##   Sunday_sales Store_count
##          <dbl>       <int>
## 1            0        1082
## 2            1          33

33 stores have sunday sales. We added this attribute to the data set.

180 stores do not have the data during 6 months end of 2014.

Let’s see the number of stores by missing data and no sunday sales group.

#--> 180 stores do not have the data during 6 months end of 2014
# NOTE(review): this chunk repeats the Sunday-sales tabulation computed
# above verbatim (same output: 1082 stores without, 33 with); kept so the
# rendered report is unchanged, but it could be removed.
rossmann%>%
  dplyr::filter(DayOfWeek==7)%>%
  group_by( Store)%>%
  summarise(Sales = sum(Sales))%>%
  ungroup()%>%
  mutate(Sunday_sales = ifelse(Sales >0,1,0))%>%
  group_by(Sunday_sales)%>%
  summarise(Store_count = n())
## # A tibble: 2 x 2
##   Sunday_sales Store_count
##          <dbl>       <int>
## 1            0        1082
## 2            1          33
#--> 33 stores have Sunday sales (the table above shows 33, not 30)
# Add the Sunday_sales flag (1 if the store has any Sunday turnover, else 0)
# to every row of the main data set via a left join on Store.
rossmann <-rossmann%>%
  dplyr::filter(DayOfWeek==7)%>%
  group_by( Store)%>%
  summarise(Sales = sum(Sales))%>%
  arrange(Sales)%>%
  ungroup()%>%
  mutate(Sunday_sales = ifelse(Sales >0,1,0))%>%
  select(-Sales)%>%
  left_join(rossmann, by = 'Store')%>%
  as.data.frame()
# Visual sanity check of one store's daily Sales series
rossmann%>%filter(Store==13)%>%
  ggplot(aes(x=Date, y = Sales))+geom_line()

# number of stores by missing data and no sunday sales group  
# NOTE(review): the `missing` column is not created anywhere in the visible
# code -- presumably a hidden chunk flags the 180 stores with the 2014 gap.
# Confirm where `missing` is defined before relying on this table.
rossmann%>%
  select(missing,Sunday_sales, Store)%>%
  unique()%>%
  group_by(missing,Sunday_sales)%>%
  summarise(store_count = n())
## # A tibble: 4 x 3
## # Groups:   missing [?]
##   missing Sunday_sales store_count
##     <dbl>        <dbl>       <int>
## 1       0            0         903
## 2       0            1          32
## 3       1            0         179
## 4       1            1           1

Convert second promotion to numerical variables, drop such variables as: PromoInterval,Promo2SinceYear,Promo2SinceWeek, CompetitionOpenSinceYear,CompetitionOpenSinceMonth.

# Convert the second promotion to numeric features:
#  - date_since_promo2: days since Promo2 started (0 when unknown / not
#    participating);
#  - promo2_month: 1 when Promo2 has started and the current Month is one
#    of the promotion's renewal months, else 0.
# FIXES vs original:
#  - PromoInterval levels in the data are comma-separated
#    ("Feb,May,Aug,Nov", ... -- see summary() output above), not
#    underscore-separated, so the old comparisons never matched and
#    promo2_month was always 0;
#  - ceiling(Promo2SinceWeek/4) reaches 13 for weeks 49-50, an invalid
#    month that made as.Date() silently return NA; cap it at 12.
rossmann <-rossmann%>%
  group_by(Store)%>%
  mutate(promo2_start_month = pmin(ceiling(Promo2SinceWeek/4), 12),
         date_since_promo2 = Date - as.Date(paste(Promo2SinceYear,
                                                  ifelse(promo2_start_month < 10,
                                                         paste0(0, promo2_start_month),
                                                         promo2_start_month),
                                                  '01', sep = '-'),
                                            format = '%Y-%m-%d'))%>%
  mutate(date_since_promo2 = ifelse(is.na(date_since_promo2), 0, date_since_promo2))%>%
  mutate(promo2_month = ifelse(date_since_promo2>0 & PromoInterval == 'Feb,May,Aug,Nov' & Month %in% c(2,5,8,11),1,
                               ifelse(date_since_promo2>0 & PromoInterval == 'Jan,Apr,Jul,Oct' & Month %in% c(1,4,7,10),1,
                                      ifelse(date_since_promo2>0 & PromoInterval == 'Mar,Jun,Sept,Dec' & Month %in% c(3,6,9,12),1,0))))%>%
  select(-promo2_start_month)
# drop some columns we have used
rossmann<-dplyr::select(rossmann, -c(PromoInterval,Promo2SinceYear,Promo2SinceWeek))

#-- days elapsed since the nearest competitor opened (NA when unknown)
rossmann <- rossmann %>%
  group_by(Store) %>%
  mutate(comp_open_month = ifelse(CompetitionOpenSinceMonth < 10,
                                  paste0(0, CompetitionOpenSinceMonth),
                                  CompetitionOpenSinceMonth),
         date_since_comp_op = Date - as.Date(paste(CompetitionOpenSinceYear,
                                                   comp_open_month,
                                                   '01', sep = '-'),
                                             format = '%Y-%m-%d')) %>%
  select(-comp_open_month)

# drop the raw year/month columns now that the feature is built
rossmann <- dplyr::select(rossmann, -c(CompetitionOpenSinceYear, CompetitionOpenSinceMonth))

With the expectation of getting better results, we add new variables - average Sales and average number of Customers over the last quarter, half year and year.

# Rolling-average features. All constants are in DAYS of store history:
#   window  = 42: the 6-week forecast horizon; Sales/Customers are lagged by
#             this amount (data.table::shift) so each feature only uses
#             information available at prediction time.
#   quarter = 120, half_yr = 150, last_yr = 365: rolling-mean window widths.
# NOTE(review): 120 and 150 do not literally match a quarter (~91 days) or a
# half year (~182 days) -- presumably deliberate tuning; confirm.
# NOTE(review): `quarter` masks data.table::quarter()/lubridate::quarter()
# (visible in the conflicts listing later in this report).
window <- 6*7
quarter <- 120
half_yr <- 150
last_yr <- 365

# For each store: left-aligned rolling mean of the lagged series, padded
# with k-1 leading NAs so the result has the same length as the group.
rossmann<-rossmann%>%
  group_by(Store)%>%
  mutate(qtr_S=c(rep(NA,quarter-1),
                 rollmean(shift(Sales,window),
                          k= quarter,
                          align = 'left')),
         halfyr_S=c(rep(NA,half_yr-1),
                    rollmean(shift(Sales,window),
                             k= half_yr,
                             align = 'left')),
         yr_S=c(rep(NA,last_yr-1),
                rollmean(shift(Sales,window),
                         k= last_yr,
                         align = 'left')),
         qtr_Cs=c(rep(NA,quarter-1),
                  rollmean(shift(Customers,window),
                           k= quarter,
                           align = 'left')),
         halfyr_Cs=c(rep(NA,half_yr-1),
                     rollmean(shift(Customers,window),
                              k= half_yr,
                              align = 'left')),
         yr_Cs=c(rep(NA,last_yr-1),
                 rollmean(shift(Customers,window),
                          k= last_yr,
                          align = 'left')))

What’s more, it seemed interesting to check, for example, whether the number of days since summer is significant.

# Days elapsed since mid-June ("summer") of the current year; clamped to
# zero for dates before June 15th.
rossmann <- rossmann %>%
  group_by(Store, Year) %>%
  mutate(DayssinceSm = Date - as.Date(paste0(Year, "-06-15"), "%Y-%m-%d"),
         DayssinceSm = ifelse(DayssinceSm > 0, DayssinceSm, 0))

And of course we check for missing values and remove them.

# Remove rows containing any NA (mostly the warm-up rows of the rolling
# features), then set the final column types for modelling.
sum(is.na(rossmann))
rossmann <- rossmann[complete.cases(rossmann), ]

factor_cols  <- c("DayOfWeek", "StoreType", "Assortment")
numeric_cols <- c("Store", "Sunday_sales", "Month", "Year", "Sales",
                  "Customers", "Promo", "date_since_comp_op")

for (col in factor_cols)  rossmann[[col]] <- as.factor(rossmann[[col]])
for (col in numeric_cols) rossmann[[col]] <- as.numeric(rossmann[[col]])

Let’s split the data into train and test data sets and perform exploratory data analysis based on the train dataset.

# Split into train/test: the last `window` (42) days form the test set.
# FIX: the original filtered train on Date < split_date and test on
# Date > split_date, so the day exactly equal to split_date was silently
# dropped from BOTH partitions; keep it in the training data.
split_date <- max(rossmann$Date) - window
#
train <- rossmann %>%
  group_by(Store) %>%
  filter(Date <= split_date)

test <- rossmann %>%
  group_by(Store) %>%
  filter(Date > split_date)

2. Exploratory Data Analysis

The distribution of Sales is right-skewed. It means that mean Sales per store is higher than its median.

# Distribution of daily Sales (100 bins)
hist(train$Sales, 100) # Sales

Have a look into the distribution of Customer.

# Distribution of daily Customers (100 bins)
hist(train$Customers, 100)

# Distribution of distance (in meters) to the nearest competitor
hist(train$CompetitionDistance, 100)

Sales is as expected strongly correlated with the number of customers. It looks like the Boxplots of customers overlap a little more than the boxplots of sales. This would mean that the promos are not mainly attracting more customers but make customers spend more. The mean amount spent per customer is about one Euro higher:

There are sometimes promos while the respective store is closed and there are promos 45% of the time:

# Cross-tabulate open/closed (Sales = 0 means closed) against promo status:
# shows promos sometimes run while the store is closed.
table(ifelse(train$Sales != 0, "Sales > 0", "Sales = 0"),
      ifelse(train$Promo, "Promo", "No promo"))
##            
##             No promo  Promo
##   Sales = 0    56003   4898
##   Sales > 0   157785 133486

We remove the variables Date, Week, Store and Customers because they are not used as predictors in the models below.

# Delete columns that are not used as predictors (identifiers, the leaky
# same-day Customers count, and helper flags) from both partitions.
drop_cols <- c("Date", "Store", "Customers", "Week", "missing", "promo2_month")
for (col in drop_cols) {
  train[[col]] <- NULL
  test[[col]]  <- NULL
}

2.2 Correlation

Separate objects including a vector of potential predictors by type.

# Predictor names grouped by type; `variables` is the union of the two
# groups (categorical first, preserving the original order) and `depend`
# names the target column.
categorical_vars <- c("DayOfWeek", "StateHoliday", "StoreType", "Assortment")

continuous_vars <- c("Sunday_sales", "Open", "Promo", "SchoolHoliday",
                     "CompetitionDistance", "Promo2", "Month", "Year",
                     "date_since_promo2", "date_since_comp_op",
                     "qtr_S", "halfyr_S", "yr_S",
                     "qtr_Cs", "halfyr_Cs", "yr_Cs", "DayssinceSm")

variables <- c(categorical_vars, continuous_vars)

depend <- "Sales"

We check the correlation using Pearson method.

library(corrplot)
## corrplot 0.84 loaded
# Pearson correlations between continuous predictors; use = 'complete.obs'
# performs listwise deletion of rows containing any NA.
corr = cor(as.data.frame(train)[continuous_vars], use = 'complete.obs', method = 'pearson')
corrplot(corr, method="circle")

corrplot(corr, method="number")

As we can observe, there are some variables which are highly correlated (>0.8). We don’t remove them, because in the Machine Learning methods it’s not so significant.

2.3 Feature Selection - Step-wise Regression

To check which variables are the most important, Step-wise Regression is used.

# Endpoints for the step-wise search: from intercept-only up to all predictors
base.mod <- lm(Sales ~ 1 , data= train)  # base intercept only model
all.mod <- lm(Sales ~ . , data= train) # full model with all predictors

Almost all variables are significant (p-value < 0.05). Only StoreType is insignificant (p-value>0.05). Multiple R-squared is the same as Adjusted R-squred = 0.8286. It means that 82.86% of changes in Sales are interpreted by the model.

# Bidirectional step-wise selection (AIC), capped at 1000 steps, silent
stepMod <- step(base.mod, scope = list(lower = base.mod, upper = all.mod), direction = "both", trace = 0, steps = 1000)  # perform step-wise algorithm
shortlistedVars <- names(unlist(stepMod[[1]])) # get the shortlisted variable. (stepMod[[1]] = coefficients of the final lm)
shortlistedVars <- shortlistedVars[!shortlistedVars %in% "(Intercept)"]  # remove intercept 
print(shortlistedVars)
##  [1] "Open"                "yr_S"                "Promo"              
##  [4] "DayOfWeek2"          "DayOfWeek3"          "DayOfWeek4"         
##  [7] "DayOfWeek5"          "DayOfWeek6"          "DayOfWeek7"         
## [10] "Month"               "qtr_S"               "Sunday_sales"       
## [13] "StateHolidaya"       "StateHolidayb"       "StateHolidayc"      
## [16] "SchoolHoliday"       "Year"                "halfyr_S"           
## [19] "halfyr_Cs"           "qtr_Cs"              "yr_Cs"              
## [22] "DayssinceSm"         "Assortmentb"         "Assortmentc"        
## [25] "Promo2"              "date_since_promo2"   "StoreTypeb"         
## [28] "StoreTypec"          "StoreTyped"          "CompetitionDistance"
## [31] "date_since_comp_op"

It looks, like variables “Open”, “yr_S”, “Promo”, “DayOfWeek2”, “DayOfWeek3” are the most important variables.

3. Machine Learning Methods

3.1 Linear Regression

# regressionMetrics() is a project helper returning MSE/RMSE/MAE/MedAE/R2
source('regressionMetrics.R')

# Baseline: linear regression on all remaining predictors
model1 <- lm(Sales ~ .,
             data = train)
summary(model1)
## 
## Call:
## lm(formula = Sales ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13365.7   -936.1   -130.3    785.3  24886.0 
## 
## Coefficients:
##                       Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)         -2.969e+05  1.426e+04  -20.830  < 2e-16 ***
## Sunday_sales        -7.167e+02  2.396e+01  -29.906  < 2e-16 ***
## DayOfWeek2          -1.027e+03  1.026e+01 -100.047  < 2e-16 ***
## DayOfWeek3          -1.463e+03  1.039e+01 -140.853  < 2e-16 ***
## DayOfWeek4          -1.404e+03  1.037e+01 -135.350  < 2e-16 ***
## DayOfWeek5          -1.098e+03  1.034e+01 -106.274  < 2e-16 ***
## DayOfWeek6          -1.095e+03  1.110e+01  -98.602  < 2e-16 ***
## DayOfWeek7          -3.409e+02  3.919e+01   -8.698  < 2e-16 ***
## Open                 6.601e+03  3.858e+01  171.128  < 2e-16 ***
## Promo                2.213e+03  6.523e+00  339.334  < 2e-16 ***
## StateHolidaya       -3.040e+02  4.191e+01   -7.253 4.07e-13 ***
## StateHolidayb       -1.081e+03  4.843e+01  -22.313  < 2e-16 ***
## StateHolidayc        2.178e+02  6.021e+01    3.617 0.000298 ***
## SchoolHoliday        2.211e+02  7.903e+00   27.973  < 2e-16 ***
## StoreTypeb          -1.538e+02  3.989e+01   -3.856 0.000115 ***
## StoreTypec          -5.853e+00  8.164e+00   -0.717 0.473447    
## StoreTyped           3.542e+00  7.354e+00    0.482 0.630088    
## Assortmentb          4.585e+02  5.570e+01    8.232  < 2e-16 ***
## Assortmentc          2.355e+01  6.173e+00    3.816 0.000136 ***
## CompetitionDistance -1.059e-03  3.878e-04   -2.731 0.006308 ** 
## Promo2               8.775e+01  8.538e+00   10.277  < 2e-16 ***
## Month                5.712e+01  2.312e+00   24.709  < 2e-16 ***
## Year                 1.446e+02  7.075e+00   20.434  < 2e-16 ***
## date_since_promo2   -5.734e-02  6.689e-03   -8.573  < 2e-16 ***
## date_since_comp_op   3.250e-03  1.306e-03    2.489 0.012814 *  
## qtr_S               -8.201e-01  4.905e-02  -16.719  < 2e-16 ***
## halfyr_S             2.047e+00  5.902e-02   34.680  < 2e-16 ***
## yr_S                -2.137e-01  2.687e-02   -7.951 1.85e-15 ***
## qtr_Cs               1.402e+01  5.169e-01   27.126  < 2e-16 ***
## halfyr_Cs           -1.905e+01  6.044e-01  -31.521  < 2e-16 ***
## yr_Cs                4.788e+00  2.359e-01   20.298  < 2e-16 ***
## DayssinceSm          1.193e+00  1.164e-01   10.251  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1617 on 352140 degrees of freedom
## Multiple R-squared:  0.8286, Adjusted R-squared:  0.8286 
## F-statistic: 5.492e+04 on 31 and 352140 DF,  p-value: < 2.2e-16
# FIX: spell out `newdata=`; the original `new=` relied on R's partial
# argument matching (same behavior, but fragile and flagged by lintr).
model1.pred <- predict(model1, newdata = test)
summary(model1.pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -3432    4664    6462    6178    8159   23877

Almost all variables are significant (p-value < 0.05). Only StoreType is insignificant (p-value>0.05). Multiple R-squared is the same as Adjusted R-squred = 0.8286. It means that 82.86% of changes in Sales are interpreted by the model.

# test dataset
# Out-of-sample fit of the full linear model
LNModel <- regressionMetrics(real = test$Sales,
                             predicted = model1.pred)
LNModel
##       MSE     RMSE      MAE    MedAE        R2
## 1 2155991 1468.329 1062.919 820.1402 0.8460609

84.60 % of variation in Sales is described by LNModel. Root mean square deviation is equal to 1468.329.

3.2 Linear Regression without insignificant variable “StoreType”

# model1A: same as model1 but with the insignificant StoreType removed
model1A <- lm(Sales ~ Sunday_sales + DayOfWeek + Open + Promo + StateHoliday +
                Assortment + CompetitionDistance + Promo2 + Month + Year + date_since_promo2 +
                date_since_comp_op + qtr_S + halfyr_S + yr_S + qtr_Cs + halfyr_Cs + 
                yr_Cs + DayssinceSm,
              data = train)
summary(model1A)
## 
## Call:
## lm(formula = Sales ~ Sunday_sales + DayOfWeek + Open + Promo + 
##     StateHoliday + Assortment + CompetitionDistance + Promo2 + 
##     Month + Year + date_since_promo2 + date_since_comp_op + qtr_S + 
##     halfyr_S + yr_S + qtr_Cs + halfyr_Cs + yr_Cs + DayssinceSm, 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13390.6   -937.9   -134.0    781.9  24855.6 
## 
## Coefficients:
##                       Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)         -2.880e+05  1.426e+04  -20.192  < 2e-16 ***
## Sunday_sales        -7.650e+02  1.997e+01  -38.314  < 2e-16 ***
## DayOfWeek2          -1.019e+03  1.027e+01  -99.205  < 2e-16 ***
## DayOfWeek3          -1.456e+03  1.040e+01 -140.040  < 2e-16 ***
## DayOfWeek4          -1.399e+03  1.038e+01 -134.692  < 2e-16 ***
## DayOfWeek5          -1.094e+03  1.035e+01 -105.754  < 2e-16 ***
## DayOfWeek6          -1.138e+03  1.100e+01 -103.443  < 2e-16 ***
## DayOfWeek7          -4.043e+02  3.911e+01  -10.338  < 2e-16 ***
## Open                 6.582e+03  3.854e+01  170.775  < 2e-16 ***
## Promo                2.204e+03  6.521e+00  337.943  < 2e-16 ***
## StateHolidaya       -3.267e+02  4.189e+01   -7.799 6.25e-15 ***
## StateHolidayb       -9.443e+02  4.815e+01  -19.611  < 2e-16 ***
## StateHolidayc        3.600e+02  5.998e+01    6.001 1.97e-09 ***
## Assortmentb          3.996e+02  5.165e+01    7.737 1.02e-14 ***
## Assortmentc          2.672e+01  6.017e+00    4.441 8.96e-06 ***
## CompetitionDistance -8.349e-04  3.846e-04   -2.171   0.0299 *  
## Promo2               8.903e+01  8.545e+00   10.419  < 2e-16 ***
## Month                6.312e+01  2.304e+00   27.398  < 2e-16 ***
## Year                 1.402e+02  7.080e+00   19.799  < 2e-16 ***
## date_since_promo2   -5.815e-02  6.690e-03   -8.693  < 2e-16 ***
## date_since_comp_op   3.151e-03  1.306e-03    2.413   0.0158 *  
## qtr_S               -7.246e-01  4.898e-02  -14.792  < 2e-16 ***
## halfyr_S             1.895e+00  5.883e-02   32.211  < 2e-16 ***
## yr_S                -1.551e-01  2.682e-02   -5.783 7.34e-09 ***
## qtr_Cs               1.345e+01  5.170e-01   26.006  < 2e-16 ***
## halfyr_Cs           -1.817e+01  6.041e-01  -30.077  < 2e-16 ***
## yr_Cs                4.456e+00  2.357e-01   18.908  < 2e-16 ***
## DayssinceSm          9.728e-01  1.162e-01    8.371  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1619 on 352144 degrees of freedom
## Multiple R-squared:  0.8282, Adjusted R-squared:  0.8282 
## F-statistic: 6.289e+04 on 27 and 352144 DF,  p-value: < 2.2e-16

Excluding the variable StoreType decreases the R2 from 82.86% to 82.82% on the train dataset. The change is small, but still worth noting. In model1A all variables are significant (p-value < 0.05). Let’s make the prediction on the test data set.

# FIX: `newdata=` spelled out (original `new=` used partial matching)
model1A.pred <- predict(model1A, newdata = test)
LNModelA <- regressionMetrics(real = test$Sales,
                              predicted = model1A.pred)
LNModelA# 84.73 %
##       MSE     RMSE      MAE    MedAE        R2
## 1 2137778 1462.114 1050.992 805.5585 0.8473613

After removing insignificant variable our result is a little bit improved comparing to model1. 84.73 % of changes in Sales are predicted by the model.

3.3 Linear Regression including top 5 variables based on Step-wise Regression

Investigating further, we check how good the model is with only the most significant variables found by the Step-wise regression.

# Compact model with the top step-wise predictors.
# NOTE(review): the text says "top 5 variables" but the formula lists four
# terms -- DayOfWeek is a single factor that expands into six dummies.
model1B <- lm(Sales ~ Open + yr_S + Promo + DayOfWeek,
              data = train)
summary(model1B)
## 
## Call:
## lm(formula = Sales ~ Open + yr_S + Promo + DayOfWeek, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14030.8   -956.3   -139.0    795.0  25110.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.406e+03  1.765e+01 -306.30   <2e-16 ***
## Open         6.812e+03  1.492e+01  456.64   <2e-16 ***
## yr_S         9.744e-01  1.334e-03  730.38   <2e-16 ***
## Promo        2.212e+03  6.609e+00  334.65   <2e-16 ***
## DayOfWeek2  -9.944e+02  1.043e+01  -95.38   <2e-16 ***
## DayOfWeek3  -1.434e+03  1.043e+01 -137.49   <2e-16 ***
## DayOfWeek4  -1.391e+03  1.039e+01 -133.78   <2e-16 ***
## DayOfWeek5  -1.102e+03  1.043e+01 -105.67   <2e-16 ***
## DayOfWeek6  -1.127e+03  1.110e+01 -101.49   <2e-16 ***
## DayOfWeek7  -1.868e+02  1.741e+01  -10.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1649 on 352162 degrees of freedom
## Multiple R-squared:  0.8219, Adjusted R-squared:  0.8219 
## F-statistic: 1.806e+05 on 9 and 352162 DF,  p-value: < 2.2e-16

The results for model1B on train data set are the worst comparing to two previouse models. R2 is equal to 82.19 %.

# FIX: `newdata=` spelled out (original `new=` used partial matching)
model1B.pred <- predict(model1B, newdata = test)
# test dataset
LNModelB <- regressionMetrics(real = test$Sales,
                              predicted = model1B.pred) # 84.75 %
LNModelB
##       MSE     RMSE      MAE    MedAE        R2
## 1 2134447 1460.975 1039.168 783.5543 0.8475992

Interestingly, R2 on the train data set is lower but on the test data set is higher than in the previous two models, although the difference isn’t large. 84.75% of changes in Sales are predicted by model1B. RMSE is smaller than in model1 and model1A on the test data set.

3.4 Regression Tree

## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  1.4.2     ✔ stringr 1.3.1
## ✔ purrr   0.2.5     ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.3
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ data.table::between()    masks dplyr::between()
## ✖ lubridate::date()        masks base::date()
## ✖ dplyr::filter()          masks timeSeries::filter(), stats::filter()
## ✖ data.table::first()      masks dplyr::first(), xts::first()
## ✖ data.table::hour()       masks lubridate::hour()
## ✖ lubridate::intersect()   masks base::intersect()
## ✖ data.table::isoweek()    masks lubridate::isoweek()
## ✖ dplyr::lag()             masks timeSeries::lag(), stats::lag()
## ✖ data.table::last()       masks dplyr::last(), xts::last()
## ✖ data.table::mday()       masks lubridate::mday()
## ✖ data.table::minute()     masks lubridate::minute()
## ✖ data.table::month()      masks lubridate::month()
## ✖ .GlobalEnv::quarter()    masks data.table::quarter(), lubridate::quarter()
## ✖ data.table::second()     masks lubridate::second()
## ✖ lubridate::setdiff()     masks base::setdiff()
## ✖ purrr::transpose()       masks data.table::transpose()
## ✖ lubridate::union()       masks base::union()
## ✖ data.table::wday()       masks lubridate::wday()
## ✖ data.table::week()       masks lubridate::week()
## ✖ data.table::yday()       masks lubridate::yday()
## ✖ data.table::year()       masks lubridate::year()
## Warning: package 'MASS' was built under R version 3.4.4
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Warning: package 'tree' was built under R version 3.4.4
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.4.4
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## Warning: package 'rpart' was built under R version 3.4.3
## Warning: package 'rpart.plot' was built under R version 3.4.4
## Warning: package 'rattle' was built under R version 3.4.4
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

Let’s run regression tree on all predictors on train data set.

# Regression tree (package `tree`) on all predictors
model2 <- Sales ~ . 
Rossmann.tree <- tree(model2 ,train)
summary(Rossmann.tree)
## 
## Regression tree:
## tree(formula = model2, data = train)
## Variables actually used in tree construction:
## [1] "Open"     "qtr_S"    "Promo"    "halfyr_S" "yr_S"    
## Number of terminal nodes:  8 
## Residual mean deviance:  2800000 = 9.861e+11 / 352200 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -11000.0   -861.1      0.0      0.0    576.8  27240.0

Only 5 variables are selected in the tree construction: Open, qtr_S, Promo, halfyr_S, yr_S.

Let’s visualise the tree on the plot.

# Draw the fitted tree; pretty = 0 prints factor level names at splits
plot(Rossmann.tree)
text(Rossmann.tree, pretty = 0)

Number of terminal nodes: 8. Variables Open, Promo and yr_S are significant both Regression Tree and Step-wise Regression.

We apply cross validation with 10 folds to access appropriate tree size.

# 10-fold cross-validation: deviance as a function of subtree size
Rossmann.cv <- cv.tree(Rossmann.tree, K = 10)
plot(Rossmann.cv$size, Rossmann.cv$dev, type = 'b') 

We assume that the appropriate number of terminal nodes is 5 and then plot the pruned tree.

# Prune to the 5 terminal nodes chosen from the CV curve, and plot
Rossmann.prune <- prune.tree(Rossmann.tree, best = 5)
plot(Rossmann.prune)
text(Rossmann.prune, pretty = 0)

And finally we can build and visualise the prediction.

# NOTE(review): this predicts with the UNPRUNED Rossmann.tree even though
# Rossmann.prune (5 nodes) was just built -- confirm which was intended.
model2.pred <- predict(Rossmann.tree, test)

# visualising prediction 
Rossmann.test1 <- test$Sales
plot(model2.pred, Rossmann.test1)
abline(0, 1)

# mean square of prediction error
RegressTree <- regressionMetrics(real = test$Sales,
                                 predicted = model2.pred)
RegressTree
##       MSE     RMSE      MAE    MedAE        R2
## 1 2393061 1546.952 1015.954 712.9639 0.8291339

The prediction is worse than in the case of Linear Regression. R2 is equal to 82.91% and RMSE is 1546.952.

3.5 Regression Tree - including top 5 variables based on Step-wise Regression

# Regression tree restricted to the top step-wise predictors
model2B <- Sales ~ Open + yr_S + Promo + DayOfWeek 
Rossmann.treeB <- tree(model2B ,train)

# Print the split structure (node, split rule, n, deviance, fitted value)
Rossmann.treeB
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 352172 5.374e+12  5844  
##    2) Open < 0.5 60877 0.000e+00     0 *
##    3) Open > 0.5 291295 2.860e+12  7065  
##      6) yr_S < 6516.19 207355 8.767e+11  5895  
##       12) Promo < 0.5 112198 2.863e+11  4937  
##         24) yr_S < 4625.83 48395 6.152e+10  3903 *
##         25) yr_S > 4625.83 63803 1.338e+11  5721 *
##       13) Promo > 0.5 95157 3.660e+11  7025  
##         26) yr_S < 4804.01 46163 9.400e+10  5825 *
##         27) yr_S > 4804.01 48994 1.431e+11  8154 *
##      7) yr_S > 6516.19 83940 9.978e+11  9957  
##       14) yr_S < 10099.9 72638 5.075e+11  9222  
##         28) Promo < 0.5 39382 1.671e+11  7889 *
##         29) Promo > 0.5 33256 1.875e+11 10800 *
##       15) yr_S > 10099.9 11302 1.987e+11 14680 *
# Summary: variables actually used, terminal node count, residual deviance
summary(Rossmann.treeB)
## 
## Regression tree:
## tree(formula = model2B, data = train)
## Variables actually used in tree construction:
## [1] "Open"  "yr_S"  "Promo"
## Number of terminal nodes:  8 
## Residual mean deviance:  2799000 = 9.858e+11 / 352200 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -11000.0   -862.7      0.0      0.0    578.6  27240.0
# Draw the restricted tree with split labels
plot(Rossmann.treeB)
text(Rossmann.treeB, pretty = 0)

It looks like Open, yr_S and Promo are the most significant variables.

As previously we apply cross-validation with 10 folds to access appropriate tree size.

# 10-fold cross-validation for the restricted tree.
# FIX: the original plotted Rossmann.cv$devB -- a non-existent component of
# the FIRST cv object (i.e. NULL) -- instead of Rossmann.cvB$dev.
Rossmann.cvB <- cv.tree(Rossmann.treeB, K = 10)
plot(Rossmann.cvB$size, Rossmann.cvB$dev, type = 'b')

And assume that the appropriate number of terminal nodes is 5.

# Prune the restricted tree to 5 terminal nodes
Rossmann.pruneB <- prune.tree(Rossmann.treeB, best = 5)

Let’s plot the pruned tree and finally build the prediction.

# NOTE(review): as with the full tree, prediction uses the unpruned
# Rossmann.treeB rather than Rossmann.pruneB -- confirm intent.
model2B.pred <- predict(Rossmann.treeB, test)

# visualising prediction 
Rossmann.test1 <- test$Sales
plot(model2B.pred, Rossmann.test1)
abline(0, 1)

# mean square of prediction error
RegressTreeB <- regressionMetrics(real = test$Sales,
                                  predicted = model2B.pred)
RegressTreeB
##       MSE     RMSE      MAE MedAE       R2
## 1 2420245 1555.714 1029.322   728 0.827193

R2 is 82.71% and RMSE is equal to 1555.714. These results are slightly worse than the Regression Tree with all variables (R2 82.91%, RMSE 1546.952) and still worse than the simple Regression.

3.6 XGBOOST

We fit the model on the training data set.

library(xgboost)
## Warning: package 'xgboost' was built under R version 3.4.4
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
## 
##     xgboost
## The following object is masked from 'package:dplyr':
## 
##     slice
set.seed(123324)
# Toggle: the 10-fold CV grid search via caret is expensive, so it is
# skipped when knitting (set to TRUE to retrain).
# FIXES vs original: `F` -> `FALSE` (F is a reassignable binding), and
# saveRDS() referenced a misspelled object `predicitons`, which would have
# errored the first time training was enabled.
training <- FALSE
if (training){
  model3 <- train(
    Sales ~., data = train, method = "xgbTree",
    trControl = trainControl("cv", number = 10)
  )
  
  model3$bestTune
  
  # And then let's make the prediction on the test data set.
  
  predictions <- model3 %>% predict(test)
  head(predictions)
  saveRDS(predictions, file ='predictions.Rds')
  XGBoost <- regressionMetrics(real = test$Sales,
                                    predicted = predictions) 
  XGBoost
}

3.7 Penalized Regression Essentials: Ridge, Lasso

For the purpose of further analysis we need to create two objects:

y for storing the outcome variable x for holding the predictor variables

# glmnet provides penalized (ridge/lasso) regression; tidyverse and caret
# are (re-)attached for the remaining analysis
library(tidyverse)
library(caret)
#install.packages("glmnet")
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.4
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.4.4
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.3
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded glmnet 2.0-16
# Predictor variables
# model.matrix() expands factors to dummy columns; drop the intercept
# column with [,-1] since glmnet fits its own intercept.
x <- model.matrix(Sales~., train)[,-1]
# Outcome variable
y <- train$Sales

3.8 Ridge Regression

In penalized regression, you have to specify a constant lambda to adjust the amount of the coefficient shrinkage. The best lambda for the data is defined as the lambda that minimize the cross-validation prediction error rate. This is determined automatically using the function cv.glmnet().

Alpha is equal to 0, because this value is defined for Ridge Regression.

# Find the best lambda using cross-validation
# alpha = 0 selects the ridge penalty; cv.glmnet picks lambda.min as the
# lambda minimising the CV prediction error.
set.seed(123) 
cv <- cv.glmnet(x, y, alpha = 0)
cv$lambda.min
## [1] 293.2092

In our case the minimum lambda is 293.2092.

# Fit the final model on the training data
# Refit ridge (alpha = 0) at the CV-selected lambda.
model5 <- glmnet(x, y, alpha = 0, lambda = cv$lambda.min)
# Display regression coefficients
coef(model5)
## 32 x 1 sparse Matrix of class "dgCMatrix"
##                                s0
## (Intercept)         -2.819662e+05
## Sunday_sales        -4.385642e+02
## DayOfWeek2          -5.599388e+02
## DayOfWeek3          -9.824149e+02
## DayOfWeek4          -9.500882e+02
## DayOfWeek5          -6.663020e+02
## DayOfWeek6          -7.028513e+02
## DayOfWeek7          -2.457056e+03
## Open                 3.917230e+03
## Promo                2.101716e+03
## StateHolidaya       -2.618597e+03
## StateHolidayb       -3.248947e+03
## StateHolidayc       -2.309552e+03
## SchoolHoliday        2.023427e+02
## StoreTypeb          -1.877237e+02
## StoreTypec          -2.463012e+01
## StoreTyped           9.874562e+01
## Assortmentb         -1.747349e+02
## Assortmentc          8.238457e+01
## CompetitionDistance  1.402994e-03
## Promo2               7.312756e+01
## Month                5.377921e+01
## Year                 1.383517e+02
## date_since_promo2   -2.334723e-02
## date_since_comp_op   4.994841e-03
## qtr_S                3.026933e-01
## halfyr_S             2.944394e-01
## yr_S                 2.968929e-01
## qtr_Cs               2.287218e-01
## halfyr_Cs            1.555869e-01
## yr_Cs                2.056544e-01
## DayssinceSm          1.007761e+00

Let’s make predictions on the test data

# Build the test design matrix the same way as the training one
# (dummy-expanded, intercept column dropped) and predict with ridge.
x.test <- model.matrix(Sales ~., test)[,-1]
model5.pred <- model5 %>% predict(x.test) %>% as.vector()

# Test-set performance of the ridge model.
RidgeRegress_1 <- regressionMetrics(real = test$Sales,
                                  predicted = model5.pred)
RidgeRegress_1
##       MSE     RMSE      MAE   MedAE        R2
## 1 2232567 1494.178 1060.956 799.758 0.8405933

R2 is equal 84.06% and RMSE is 1494.178 - these results are quite similar to the results from linear regression.

3.9 Lasso Regression

The only difference between the R code used for ridge regression is that, for lasso regression you need to specify the argument alpha = 1 instead of alpha = 0 (for ridge regression). As in previouse case, we find lambda using cross-validation.

# Lasso penalty: alpha = 1; again select lambda by cross-validation.
set.seed(12345) 
cv <- cv.glmnet(x, y, alpha = 1)
cv$lambda.min
## [1] 2.734479

As you can see, the best lambda is equal to 2.734479.

Let’s fit the final model on the training data and display regression coefficients.

# Refit lasso at the CV-selected lambda and inspect coefficients
# ('.' entries in the printout are coefficients shrunk exactly to zero).
model6 <- glmnet(x, y, alpha = 1, lambda = cv$lambda.min)
coef(model6)
## 32 x 1 sparse Matrix of class "dgCMatrix"
##                                s0
## (Intercept)         -3.128304e+05
## Sunday_sales        -7.173064e+02
## DayOfWeek2          -9.734598e+02
## DayOfWeek3          -1.421121e+03
## DayOfWeek4          -1.367403e+03
## DayOfWeek5          -1.061747e+03
## DayOfWeek6          -1.064592e+03
## DayOfWeek7          -2.204128e+02
## Open                 6.694886e+03
## Promo                2.208735e+03
## StateHolidaya       -1.700022e+02
## StateHolidayb       -9.436744e+02
## StateHolidayc        3.076658e+02
## SchoolHoliday        1.967546e+02
## StoreTypeb          -1.546746e+02
## StoreTypec           .           
## StoreTyped           1.212009e+01
## Assortmentb          2.954582e+02
## Assortmentc          2.449938e+01
## CompetitionDistance -5.785099e-05
## Promo2               7.412429e+01
## Month                7.459945e+01
## Year                 1.523642e+02
## date_since_promo2   -3.845303e-02
## date_since_comp_op   2.334481e-03
## qtr_S                5.001842e-01
## halfyr_S             2.448082e-01
## yr_S                 2.597846e-01
## qtr_Cs               .           
## halfyr_Cs           -1.590857e-01
## yr_Cs                .           
## DayssinceSm          2.255933e-01

Make predictions on the test data and check results.

# Test design matrix (same encoding as training) and lasso predictions.
x.test2 <- model.matrix(Sales ~., test)[,-1]
model6.pred <- model6 %>% predict(x.test2) %>% as.vector()

# Test-set performance of the lasso model.
LassoRegress_1 <- regressionMetrics(real = test$Sales,
                                  predicted = model6.pred)
LassoRegress_1
##       MSE     RMSE      MAE    MedAE        R2
## 1 2164143 1471.103 1064.469 823.8259 0.8454788

R2 is 84.54% and RMSE is equal to 1471.103. The results are better than in simple linear regression and the regression tree. So let’s investigate further.

In futher investigation we are interested to compute and compare ridge and lasso regression using the caret workflow. Caret will automatically choose the best tuning parameter values, compute the final model and evaluate the model performance using cross-validation techniques.

3.10 Ridge Regression using Caret package

We setup a grid range of lambda values and compute ridge regression.

# Grid of 100 candidate lambdas on a log scale from 1e-3 to 1e3.
lambda <- 10^seq(-3, 3, length = 100)
set.seed(1235)
# Ridge via caret: alpha fixed at 0, lambda tuned by 10-fold CV.
ridge <- train(
  Sales ~., data = train, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(alpha = 0, lambda = lambda)
)
coef(ridge$finalModel, ridge$bestTune$lambda)
## 32 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)         -2.847309e+05
## Sunday_sales        -4.403558e+02
## DayOfWeek2          -5.925777e+02
## DayOfWeek3          -1.017446e+03
## DayOfWeek4          -9.829249e+02
## DayOfWeek5          -6.975017e+02
## DayOfWeek6          -7.317390e+02
## DayOfWeek7          -2.450214e+03
## Open                 3.968343e+03
## Promo                2.110474e+03
## StateHolidaya       -2.593819e+03
## StateHolidayb       -3.237227e+03
## StateHolidayc       -2.272014e+03
## SchoolHoliday        2.020036e+02
## StoreTypeb          -1.722348e+02
## StoreTypec          -2.285709e+01
## StoreTyped           9.012787e+01
## Assortmentb         -1.138023e+02
## Assortmentc          7.748702e+01
## CompetitionDistance  1.187428e-03
## Promo2               7.357451e+01
## Month                5.488974e+01
## Year                 1.397040e+02
## date_since_promo2   -2.582179e-02
## date_since_comp_op   4.897321e-03
## qtr_S                3.105107e-01
## halfyr_S             3.001296e-01
## yr_S                 2.953491e-01
## qtr_Cs               1.872586e-01
## halfyr_Cs            1.083612e-01
## yr_Cs                2.116699e-01
## DayssinceSm          9.795607e-01

Finally, we make prediciton and check the results.

# Predict on the test set with the caret-tuned ridge model and score it.
model8 <- ridge %>% predict(test)
RidgeRegress_2 <- regressionMetrics(real = test$Sales,
                                     predicted = model8)
RidgeRegress_2
##       MSE     RMSE      MAE    MedAE        R2
## 1 2224438 1491.455 1060.239 802.0917 0.8411737

R2 is 84.11%, which is better than Ridge Regression using the cross-validation method but worse than LASSO Regression (84.54%). In Ridge Regression using the caret package, RMSE is equal to 1491.455, lower than in Ridge Regression using cross-validation (1494.178) but higher than in Lasso Regression using cross-validation.

3.11 Lasso Regression using Caret package

# Lasso via caret: alpha fixed at 1, lambda tuned by 10-fold CV over the
# same lambda grid as for ridge.
set.seed(123)
lasso <- train(
  Sales ~., data = train, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(alpha = 1, lambda = lambda)
)
coef(lasso$finalModel, lasso$bestTune$lambda)
## 32 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)         -3.204719e+05
## Sunday_sales        -7.308101e+02
## DayOfWeek2          -9.723828e+02
## DayOfWeek3          -1.416511e+03
## DayOfWeek4          -1.364407e+03
## DayOfWeek5          -1.058877e+03
## DayOfWeek6          -1.061828e+03
## DayOfWeek7          -1.626491e+02
## Open                 6.750595e+03
## Promo                2.209169e+03
## StateHolidaya       -1.141447e+02
## StateHolidayb       -8.892639e+02
## StateHolidayc        3.737920e+02
## SchoolHoliday        1.950170e+02
## StoreTypeb          -1.529149e+02
## StoreTypec           .           
## StoreTyped           1.485219e+01
## Assortmentb          2.972015e+02
## Assortmentc          2.988212e+01
## CompetitionDistance -3.772743e-06
## Promo2               7.540055e+01
## Month                7.550407e+01
## Year                 1.561285e+02
## date_since_promo2   -3.946825e-02
## date_since_comp_op   2.736665e-03
## qtr_S                5.337721e-01
## halfyr_S             1.511060e-01
## yr_S                 3.178800e-01
## qtr_Cs               .           
## halfyr_Cs           -1.433309e-01
## yr_Cs                .           
## DayssinceSm          1.334948e-01

Let’s make prediction and check the results.

# Predict on the test set with the caret-tuned lasso model and score it.
model9 <- lasso %>% predict(test)
LassoRegress_2 <- regressionMetrics(real = test$Sales,
                                     predicted = model9)
LassoRegress_2
##       MSE     RMSE      MAE    MedAE        R2
## 1 2166087 1471.763 1066.168 824.8719 0.8453401

The results are comparable with Ridge and Lasso Regression using the cross-validation technique. R2 is 84.53% and RMSE is 1471.763.

4. ARIMA

# Keep only the variables needed for the ARIMA analysis.
rossmann1 <- rossmann1%>%
  dplyr::select(Store,Date,DayOfWeek,Month,Year,Sales,Customers)%>%
  arrange(Store,Date)
# Flag stores with missing history: count observed days per store and set
# missing = 1 for stores with exactly 758 rows (presumably the row count
# of stores with a gap in the recorded period -- TODO confirm), then join
# the flag back onto the daily data.
rossmann1 <-rossmann1%>%
  group_by(Store)%>%
  summarise(days_count = n())%>%
  ungroup()%>%
  mutate(missing = ifelse(days_count==758,1,0))%>%
  dplyr::select(-days_count)%>%
  left_join(rossmann1, by = 'Store')
 # Flag stores that ever trade on Sundays (Sunday_sales = 1 when a
 # store's total Sunday turnover is positive) and join the flag back on.
rossmann1 <-rossmann1%>%
  filter(DayOfWeek==7)%>%
  group_by(Store)%>%
  summarise(Sales = sum(Sales))%>%
  arrange(Sales)%>%
  ungroup()%>%
  mutate(Sunday_sales = ifelse(Sales >0,1,0))%>%
  dplyr::select(-Sales)%>%
  left_join(rossmann1, by = 'Store')%>%
  as.data.frame()
# number of stores by missing data and no sunday sales group  
rossmann1%>%
  dplyr::select(missing,Sunday_sales, Store, Sales)%>%
  mutate(Sales = Sales/1000)%>% #avoid integer overflow
  group_by(Store,missing,Sunday_sales)%>%
  summarise(Sales = sum(Sales))%>%
  group_by(Sunday_sales,missing)%>%
  summarise(store_count = n(),
            Sales = sum(Sales))%>%
  ungroup()%>%
  mutate(sales_pct = Sales/sum(Sales))
## # A tibble: 4 x 5
##   Sunday_sales missing store_count    Sales sales_pct
##          <dbl>   <dbl>       <int>    <dbl>     <dbl>
## 1            0       0         903 4884698.  0.832   
## 2            0       1         179  738256.  0.126   
## 3            1       0          32  246121.  0.0419  
## 4            1       1           1    4106.  0.000699

=> It’s worth having two ARIMA models: one for Sunday-sales stores and one for non-Sunday-sales stores. In addition, we need to take the missing-data issue into consideration. For the ARIMA model, we only use the last 6 months of the missing-data stores for estimation.

1.2 Seasonality investigation

Investigate the weekly seasonality of the data

# Weekly seasonality: total daily sales for Feb-Apr, faceted by
# Sunday-sales group and year (free y-scale per facet).
rossmann1%>%
  filter(missing==0, Month %in% c(2,3,4))%>%
  group_by(Date,Year,Sunday_sales)%>%
  summarise(Sales = sum(Sales))%>%
  ungroup()%>%
  mutate(doy = yday(Date))%>%
  ggplot(aes(x=doy, y= Sales))+geom_line()+
  facet_wrap(Sunday_sales~Year,scales="free_y")+
  ggtitle(label = 'seasonality of Feb,Mar,Apr pattern over three years Is_Sunday_Sales vs Year')

  • There is a small cycle every week and a more pronounced cycle every two weeks (interesting)
  • The pattern of Sunday-Sales store and Non-Sunday-Sales store are similar except for Sunday

Investigate the Monthly seasonality of the data (Weekly interval)

# Weekly-interval seasonality: sales aggregated per calendar week,
# faceted by Sunday-sales group and year.
rossmann1%>%
  filter(missing==0)%>%
  mutate(Week = week(Date))%>%
  group_by(Year,Week,Sunday_sales)%>%
  summarise(Sales = sum(Sales))%>%
  ggplot(aes(x=Week, y= Sales))+geom_line()+
  facet_wrap(Sunday_sales~Year,scales="free_y")+
  ggtitle(label = 'seasonality pattern over three years Is_Sunday_Sales vs Year')

  • Weekly interval sales confirm the above pattern (2 weeks cycle)
  • Not significant difference between Sunday and Non-Sunday sales stores

Investigate the Monthly seasonality of the data (monthly interval)

# investigate the monthly seasonality of the data
# NOTE: fixed — facet_free() is not a ggplot2 function and no package
# providing it is loaded at the top of this report, so this chunk could
# not run; facet_wrap() with a free y-scale gives one panel per year.
rossmann1%>%
  filter(missing==0)%>%
  group_by(Year, Month)%>%
  summarise(Sales = sum(Sales))%>%
  ggplot(aes(x=Month, y= Sales))+geom_line()+
  facet_wrap(~Year, scales = "free_y")

There are few peaks:
+ March: ? + June: Summer starts + December: Xmas

2. Data Partition

Split the data into train and test set

# 6-week (42-day) holdout window at the end of the sample.
window <- 6*7
split_date <-max(rossmann1$Date) - window
#
train1 <- rossmann1%>%
  group_by(Store)%>%
  filter(Date <=split_date)

test1 <- rossmann1%>%
  group_by(Store)%>%
  filter(Date > split_date)

# again split the train set into sunday sales group and non-sunday sales group
train1_nosun <- train1%>%
  filter(missing==0, Sunday_sales==0, Sales >0)
# Mean daily sales across non-Sunday stores (per date).
train1_nosun_agg <- train1_nosun%>%
  group_by(Date,Year, Month)%>%
  summarise(Sales = mean(Sales))

train1_sun <- train1%>%
  filter(missing==0, Sunday_sales==1, Sales >0)
# NOTE: fixed — the original aggregated train1_nosun here (copy-paste
# slip), so the "Sunday" aggregate duplicated the non-Sunday one.
train1_sun_agg <- train1_sun%>%
  group_by(Date,Year, Month)%>%
  summarise(Sales = mean(Sales))

3. Model estimation and prediction

3.2 Test for stationarity

# ACF of the 12-lag-differenced mean daily sales (12 trading days ~ two
# weeks, since Sundays are excluded in this aggregate).
acf(diff(train1_nosun_agg$Sales,12) ,lag.max =150, ylim = c(-1,1), lwd = 5,
    col = "dark green",na.action = na.pass, main = 'ACF - 12 days (2 weeks) seasonal adjusted sales')  

# PACF of the same differenced series.
pacf(diff(train1_nosun_agg$Sales,12),lag.max = 150,lwd = 5, 
     col = "dark green",na.action = na.pass, main ='PACF - 12 days (2 weeks) seasonal adjusted sales')

# Time plot of the differenced series.
plot(diff(train1_nosun_agg$Sales,12),
     type = 'l',
     main = 'Plot of 12 day seasonal adjusted Sales')

It is hard to say whether the time series with a 12-lag difference is stationary; hopefully adding some further seasonal lags can help to solve this problem. Possible model: an ARMA model with up to 12 lags + the first difference of the seasonal factor and its first few seasonal lags.

ADF Test

# testdf() and adf_test_rename() are project helpers defined in
# custom_functions.R (ADF test with Breusch-Godfrey autocorrelation
# diagnostics for increasing numbers of augmentations).
source('custom_functions.R')
adf_test <-testdf(variable = diff(train1_nosun_agg$Sales,12), 
                  max.augmentations = 12,max.order = 8,
                  plot_title='Plot of the diff(Sales_t, Sales_T-12)')

adf_test_rename(adf_test)
##    agu p_adf p_bg.p1 p_bg.p2 p_bg.p3 p_bg.p4 p_bg.p5 p_bg.p6 p_bg.p7
## 1    0  0.01   0.056   0.000   0.000   0.000   0.000   0.000   0.000
## 2    1  0.01   0.896   0.988   0.924   0.029   0.011   0.005   0.002
## 3    2  0.01   0.962   0.928   0.984   0.022   0.011   0.006   0.002
## 4    3  0.01   0.801   0.969   0.918   0.305   0.075   0.027   0.006
## 5    4  0.01   0.721   0.937   0.985   0.980   0.973   0.903   0.160
## 6    5  0.01   0.911   0.963   0.995   0.999   0.999   1.000   0.652
## 7    6  0.01   0.971   0.999   0.998   1.000   0.949   0.960   0.929
## 8    7  0.01   0.977   0.998   1.000   0.994   0.945   0.959   0.853
## 9    8  0.01   0.921   0.993   0.998   0.977   0.899   0.927   0.862
## 10   9  0.01   0.928   0.577   0.728   0.860   0.875   0.858   0.812
## 11  10  0.01   0.764   0.561   0.678   0.822   0.853   0.864   0.812
## 12  11  0.01   0.580   0.175   0.322   0.470   0.615   0.700   0.798
## 13  12  0.01   0.789   0.192   0.347   0.499   0.640   0.726   0.821
##    p_bg.p8
## 1    0.000
## 2    0.001
## 3    0.001
## 4    0.007
## 5    0.205
## 6    0.741
## 7    0.950
## 8    0.792
## 9    0.697
## 10   0.755
## 11   0.754
## 12   0.868
## 13   0.888

H0: the time series is NON-stationary => Reject the null hypothesis that diff(Sales_t, Sales_T-12) is non-stationary with 4 augmentation

Phillips-Perron Unit Root Test

# Phillips-Perron unit-root test with intercept and trend (Z-tau statistic).
pp_test <- ur.pp(diff(train1_nosun_agg$Sales,12),type = c("Z-tau"),model = c("trend"))
summary(pp_test)
## 
## ################################## 
## # Phillips-Perron Unit Root Test # 
## ################################## 
## 
## Test regression with intercept and trend 
## 
## 
## Call:
## lm(formula = y ~ y.l1 + trend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9294.2  -416.3     6.8   533.7  5513.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.17125   55.72428   0.093    0.926    
## y.l1         0.43116    0.03320  12.986   <2e-16 ***
## trend       -0.04698    0.26050  -0.180    0.857    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1517 on 738 degrees of freedom
## Multiple R-squared:  0.1861, Adjusted R-squared:  0.1838 
## F-statistic: 84.35 on 2 and 738 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic, type: Z-tau  is: -17.6889 
## 
##            aux. Z statistics
## Z-tau-mu              0.0779
## Z-tau-beta           -0.1816
## 
## Critical values for Z statistics: 
##                      1pct      5pct     10pct
## critical values -3.975159 -3.418083 -3.131177

H0: the time series is non-stationary. Z-tau = -17.6889 < -3.975159 (1% critical value) => we reject the null of non-stationarity of diff(Sales_t, Sales_t-12).

KPSS Test

# KPSS test with a constant deterministic component ("mu").
# NOTE: fixed — the original referenced 'train_nosun_agg' (missing the
# '1' in the name), which does not exist and produced the "object not
# found" errors shown in the original report.
kpss <- ur.kpss(diff(train1_nosun_agg$Sales,12),type = c("mu")) # constant deterministic component
summary(kpss)

H0: the time series is stationary. Test statistic = 0.0109 < 0.463 (5% critical value) => we cannot reject the null of STATIONARITY of diff(Sales_t, Sales_t-12).

Model’s Form

We configure the model manually and come up with the best model.

# Manually selected specification: AR(14) non-seasonal part plus a
# seasonal (3,1,1) component with period 12 (12 trading days ~ 2 weeks
# with Sundays excluded), no constant.
arima_best <- Arima(train1_nosun_agg$Sales,order = c(14,0,0),
                  seasonal=list(order=c(3,1,1),period=12),
                  include.constant = FALSE)

summary(arima_best)
## Series: train1_nosun_agg$Sales 
## ARIMA(14,0,0)(3,1,1)[12] 
## 
## Coefficients:
##          ar1     ar2      ar3     ar4      ar5      ar6      ar7     ar8
##       0.3619  0.1967  -0.0045  0.1051  -0.1896  -0.0456  -0.1096  0.0328
## s.e.  0.0364  0.0387   0.0390  0.0406   0.0401   0.0411   0.0404  0.0395
##          ar9     ar10    ar11     ar12     ar13    ar14    sar1    sar2
##       0.1165  -0.0297  0.0721  -0.1794  -0.0788  0.1275  0.4906  0.0079
## s.e.  0.0400   0.0394  0.0390   0.1345   0.0590  0.0420  0.1449  0.0921
##         sar3     sma1
##       0.1161  -1.0000
## s.e.  0.0385   0.0254
## 
## sigma^2 estimated as 1468702:  log likelihood=-6329.37
## AIC=12696.74   AICc=12697.79   BIC=12784.32
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE
## Training set 49.39993 1187.546 774.7905 -1.943759 11.3343 0.6757775
##                     ACF1
## Training set 0.002123919
coeftest(arima_best)
## 
## z test of coefficients:
## 
##        Estimate Std. Error  z value  Pr(>|z|)    
## ar1   0.3619035  0.0363633   9.9524 < 2.2e-16 ***
## ar2   0.1966896  0.0386960   5.0829 3.716e-07 ***
## ar3  -0.0045111  0.0389705  -0.1158 0.9078454    
## ar4   0.1051428  0.0405750   2.5913 0.0095608 ** 
## ar5  -0.1895744  0.0400635  -4.7318 2.225e-06 ***
## ar6  -0.0455860  0.0411107  -1.1089 0.2674909    
## ar7  -0.1095939  0.0404463  -2.7096 0.0067361 ** 
## ar8   0.0327651  0.0395156   0.8292 0.4070082    
## ar9   0.1165062  0.0399790   2.9142 0.0035662 ** 
## ar10 -0.0296690  0.0394257  -0.7525 0.4517331    
## ar11  0.0720647  0.0390268   1.8465 0.0648136 .  
## ar12 -0.1793525  0.1344794  -1.3337 0.1823089    
## ar13 -0.0787521  0.0590313  -1.3341 0.1821798    
## ar14  0.1274817  0.0420117   3.0344 0.0024099 ** 
## sar1  0.4906213  0.1449486   3.3848 0.0007123 ***
## sar2  0.0078645  0.0920640   0.0854 0.9319243    
## sar3  0.1161093  0.0385295   3.0135 0.0025824 ** 
## sma1 -0.9999946  0.0253807 -39.3998 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Although there are some insignificant intermediate lags, we decided to keep up to the 14th lag, as it improves RMSE and BIC a lot.

Box.test(resid(arima_best), type = "Ljung-Box",lag = 7)
## 
##  Box-Ljung test
## 
## data:  resid(arima_best)
## X-squared = 0.57685, df = 7, p-value = 0.9991

With a p-value of 0.9991, we fail to reject the null hypothesis of no autocorrelation in the residuals.

plot_acf_pacf(arima_best)

There is no significant lags indicating there is auto-correlation amongs lags

6 weeks ahead forecast

We need to loop through all stores to estimate the ARIMA coefficients for each store and produce the 6-weeks-ahead forecast.

# Per-store ARIMA estimation and 6-week-ahead forecast.
store_list <- unique(rossmann1$Store)
# Forecast dates taken from store 1's test rows (same horizon for all stores).
test_date <- test1[test1$Store==1,]$Date

predict_df <- data.frame()

# Flag to skip the (very slow) per-store loop when re-knitting.
training <- FALSE
if (training){
    for (i in 1:length(store_list)){#
    print(i)
    store_i <- train1%>%filter(Store==store_list[i])
    # if the data is missing, only use data from the last couple of months
    if (store_i$missing[1]==1){
      store_i <- store_i%>%filter(Date > "2015-01-01")
    }
    # no_sunday sales model estimation
    if (store_i$Sunday_sales[1]==0){
        store_i <- store_i%>%filter(DayOfWeek !=7)
        # NOTE(review): the error handler below assigns model_i inside the
        # handler's own scope, so the assignment never reaches the loop
        # body; and it retries with arguments identical to the expr, so it
        # cannot succeed where the first attempt failed. If the expr
        # errors, model_i from a previous iteration would silently be
        # reused at the forecast() call — worth fixing before re-running.
        tryCatch(
          #expr
          {
            model_i <- Arima(store_i[,'Sales'],
                             order = c(14,0,0),
                             seasonal=list(order=c(3,1,1),period=12),
                             include.constant = FALSE, method = 'CSS')
          },error= function(cond){
            message('change the optimisation method to avoid some numerial problem with the default method')
            message(cond)
            model_i <- Arima(store_i[,'Sales'],
                             order = c(14,0,0),
                             seasonal=list(order=c(3,1,1),period=12),
                             include.constant = FALSE,
                             method = 'CSS')
          }, warning = function(w){
            message(w)
          }, finally = {
            print('passed')
          }
        )
        
        # 36 trading-day forecasts (6 weeks x 6 non-Sunday days).
        predict_i <- forecast(model_i, h =6*6)$mean
        # manually insert 0 for sunday sales so the vector covers 42
        # calendar days and aligns with test_date
        predict_i <- c(predict_i[1],0,
                     predict_i[2:7],0,
                     predict_i[8:13],0,
                     predict_i[14:19],0,
                     predict_i[20:25],0,
                     predict_i[26:31],0,
                     predict_i[32:36])
    }else{ # Sunday sales
      
      # NOTE(review): same tryCatch scoping issue as above.
      tryCatch(
        #expr
        {
          model_i <- Arima(store_i[,'Sales'],
                           order = c(15,0,0),
                           seasonal=list(order=c(3,1,1),period=14),
                           include.constant = FALSE,method = 'CSS')
          
        },error= function(cond){
          message('change the optimisation method to avoid some numerial problem with the default method')
          message(cond)
          model_i <- Arima(store_i[,'Sales'],
                           order = c(15,0,0),
                           seasonal=list(order=c(3,1,1),period=14),
                           include.constant = FALSE,
                           method = 'CSS') 
        }, warning = function(w){
          message(w)
        }, finally = {
          print('passed')
        }
      )
      # Sunday-trading stores: forecast all 42 calendar days directly.
      predict_i <- forecast(model_i, h =7*6)$mean
    }
      predict_df_i <- as.data.frame(cbind(store_list[i], predict_i))
      colnames(predict_df_i) <- c('Store','Forecast_Sales')
      predict_df_i$Date <- test_date
      predict_df <- rbind(predict_df,predict_df_i)
  }
  # NOTE(review): this joins against 'test' (the ML-section split), not
  # 'test1' (the ARIMA split defined above) — presumably 'test1' was
  # intended; verify before re-running the loop.
  ARIMA_preditc_df <- predict_df%>%left_join(test[,c('Date','Sales', 'Store')],
                                                   by =c('Store','Date'))
  
  saveRDS(ARIMA_preditc_df,'ARIMA_preditc_df.rds')
}

# Load the cached forecasts produced by a previous run of the loop.
ARIMA_preditc_df <- readRDS('ARIMA_preditc_df.rds')

Evaluate the performance

#suppressWarnings(suppressMessages(library('performanceEstimation')))
# Overall ARIMA forecast accuracy on the 6-week test horizon.
regressionMetrics(real =ARIMA_preditc_df$Sales,
                  predicted =ARIMA_preditc_df$Forecast_Sales)
##       MSE     RMSE      MAE    MedAE        R2
## 1 4832634 2198.325 1102.916 685.6279 0.6519236

ARIMA is not good for forecasting over quite a long period ahead (RMSE is quite high compared with the other machine learning models). Let’s see how the error evolves over time.

# RMSE per forecast day, to see how the error grows with the horizon
# (regressionMetrics()[,2] extracts the RMSE column).
ARIMA_preditc_df%>%
  group_by(Date)%>%
  summarise(rmse = regressionMetrics(real =Sales,
                                     predicted = Forecast_Sales)[,2])%>%
  ggplot(aes(x=Date, y = rmse))+geom_line()+
  ggtitle('rmse over time')

It can be seen that RMSE keeps increasing over time, so it is neither a very good idea nor fair to use ARIMA to forecast a long period ahead.

4. CONCLUSION AND DISCUSSION OF DRAWBACKS

SUMMARY

# Collect every model's test-set predictions into one comparison frame.
ARIMA <-readRDS('ARIMA_preditc_df.rds')
# NOTE(review): this read fails (error below) because predictions.Rds was
# never written — the XGBoost section's saveRDS call references a
# misspelled object — so XGBoost_pred is undefined from here on and the
# downstream errors in this chunk cascade from that.
XGBoost_pred <-readRDS('predictions.Rds')
## Error in gzfile(file, "rb"): cannot open the connection
test <- rossmann%>%       
  group_by(Store)%>%
  filter(Date > split_date)

result <- rossmann%>%
  filter(Date > split_date)%>%
  dplyr::select(Date, Store, Sales)
## Adding missing grouping variables: `Year`
# One column per model's predictions on the common test window.
result$lm1 <- model1.pred
result$lm_sig <- model1A.pred
result$step_wise<-model1B.pred
result$regTree <- model2.pred
result$regTree_stw <-  model2B.pred
result$XGBoost <- XGBoost_pred
## Error in eval(expr, envir, enclos): object 'XGBoost_pred' not found
result$ridge <-  model5.pred
result$lasso <-  model6.pred
result$ridge_crt <-  model8
result$lasso_crt <-  model9
result <- result%>%left_join(ARIMA[,c('Store','Date','Forecast_Sales')])
## Joining, by = c("Date", "Store")
result <- as.data.frame(result)
# NOTE(review): index 15 assumes the XGBoost column exists; with it
# missing, result has only 14 columns, hence the errors below.
names(result)[15] <- 'ARIMA'
## Error in names(result)[15] <- "ARIMA": 'names' attribute [15] must be the same length as the vector [14]
# Clip negative predictions to zero (sales cannot be negative).
result[,5:15] <- apply(result[,5:15],1:2, function(x) max(x,0))
## Error in `[.data.frame`(result, , 5:15): undefined columns selected
# Two ensemble columns: mean of all models and mean of the best five.
result <- result%>%
  mutate(avg_all = rowMeans(.[,5:15]),
         avg_best =  rowMeans(.[,c('XGBoost', 'step_wise','lm_sig','lm1','lasso')]))
## Error in mutate_impl(.data, dots): Evaluation error: undefined columns selected.
result <- as.data.frame(result)

# Score every model column against actual Sales and plot the comparison.
pef_metrics <- data.frame()
# NOTE(review): columns 5:17 assume all 13 model/ensemble columns were
# created; because the XGBoost/ensemble steps above failed, some of these
# names are absent and the loop errors (see output below).
model_names <-names(result)[5:17]

for (i in 1:length(model_names)){
  pef <-  regressionMetrics(real = result$Sales ,
                            predicted = result[,model_names[i]])
  pef$model_name <- model_names[i]
  pef_metrics <- rbind(pef_metrics,pef) 
  
}
## Error in `[.data.frame`(result, , model_names[i]): undefined columns selected
pef_metrics <- pef_metrics%>%arrange(RMSE)
# Horizontal bar chart of RMSE, worst model at the top.
ggplot(pef_metrics, aes(x=  reorder(model_name, -RMSE),y=RMSE))+
  geom_bar(stat = 'identity')+
  xlab('Model name')+
  ylab('RMSE')+
  ggtitle('Performance of various models on test set - RMSE')+
  coord_flip()

# Same comparison by R2.
ggplot(pef_metrics, aes(x=  reorder(model_name, -R2),y=R2))+
  geom_bar(stat = 'identity')+
  xlab('Model name')+
  ylab('R2')+
  ggtitle('Performance of various models on test set -R2')+
  coord_flip()

SUMMARY

*To sum up, XGBoost has the best predictive performance. It has the highest R2 = 92.8% and the lowest root mean square error, equal to 1004.16. We were a little bit surprised that Regression Trees have the worst results.

*We found, that ARIMA is only good for few period ahead forecast (the result in the test set is worse than it is in the train set.). The longer we forecast the less accurate it is, as the model’s prediction will simply converge to the unconditional mean.